#Uplift the Web Real Data Step 1 (preparing the data)

#Uplift the Web Megastudy analysis script step 1 

#date: 6/26/2025

# -----------------------------------------------------------------------------
#### STEP 1: LOAD DATA ####
# To obtain the datasets, we downloaded the data in csv Format from Qualtrics
# using the default options plus checking the box for 'Export viewing order 
# data for randomized surveys'.
# The demographics data came from the sample provider

#change below line to your target folder
setwd('~/megastudy stuff/megastudy_real_data/') 
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse,excluder,psych,skimr,splithalfr)

#We need to drop the screener data because they won't have consented yet,
#so we (the research team) are only going to keep a tally of how many people met each 
#inclusion criterion during the screener (for CONSORT diagram)
# research team will remove screener data from baseline survey before sharing, with the next 2 lines
# filter(IRB_2=='I Agree')%>% #dropping all data from those who didn't consent 
# select(-c(UserLanguage:IRB_2))#dropping screening variables 


#loading screener data (just for research team to keep a tally)
screener_names <- read_csv('megastudy+baseline_June+24,+2025_10.49 (VALUES).csv') %>% names
screener_prep <- read_csv("megastudy+baseline_June+24,+2025_10.49 (VALUES).csv", 
                           col_names = screener_names, skip = 3) %>% 
  exclude_preview() %>%
  mutate(participantId= coalesce(participantId,PROLIFIC_PID)) %>% 
  filter(!is.na(participantId)) %>%  #dropping test runs (we added fake PIDs for the generated data)
  mutate(phq_eligible=ifelse(phq_score>=10&!is.na(phq_score),1,0)) %>% 
  mutate(recruitment_platform=ifelse(!is.na(PROLIFIC_PID),'Prolific','CloudResearch')) %>% 
  select(-c('Age_written':'Employment Status',phq_score,help4mental))

## loading demographic data
labeled_bl_names <- read_csv('megastudy+baseline_June+24,+2025_10.49 (LABELLED).csv') %>% names
labeled_bl <- read_csv('megastudy+baseline_June+24,+2025_10.49 (LABELLED).csv', 
                       col_names = screener_names, skip = 3) %>% 
  exclude_preview() %>% deidentify(strict=F)%>% 
  mutate(participantId= coalesce(participantId,PROLIFIC_PID)) %>% 
  filter(!is.na(participantId)) %>% arrange(participantId) %>% distinct(participantId,.keep_all = T)

demog_path <- "~/megastudy stuff/all_connect_demogs_megastudy/" #all demog files from connect
all_demog_file_names <- paste0(demog_path, list.files(demog_path, recursive = TRUE))
a <- bind_rows(lapply(all_demog_file_names, read_csv)) %>% select(
  -c(Sex,`Occupation Field`,`Country Of Residence`))
demogs <-  a %>% 
  mutate(participantId=ParticipantId) %>% 
  select(Age:participantId) %>% 
  bind_rows(labeled_bl %>% 
              filter(!is.na(PROLIFIC_PID)&!is.na(agree_attn)) %>% 
              mutate(participantId= coalesce(participantId,PROLIFIC_PID)) %>% 
              select(Age_written:'Employment Status',help4mental,
                     participantId) %>%  
              rename(Age=Age_written,'Relationship/Marital Status'=Relationship.Marital)) %>% 
  select(-'Current Depression',-'Depression Questionnaire') %>% 
  left_join(labeled_bl %>% 
              select(participantId,help4mental2=help4mental),by='participantId') %>% 
  mutate(help4mental=coalesce(help4mental,help4mental2),.keep='unused') %>% 
  mutate(help4mental=str_replace_all(
    help4mental,fixed('A therapist, coach, or social worker (in person or online)'),
    'A therapist; coach; or social worker (in person or online)'))

skim(demogs)

# -----------------------------------------------------------------------------
#### STEP 2: EXCLUSION ####
# Here, we exclude participants who do not meet our criteria for inclusion
## first, exclusions based on the screener. 

#exclude duplicate responses (keep only first)
screener1 <- screener_prep  %>% 
  arrange(StartDate) %>% 
  filter(Progress>0) %>% #had to at least pass the first page to be counted
  distinct(participantId,.keep_all = T)
nrow(screener_prep)-nrow(screener1)

#exclude responses that failed the bot check
screener2 <- screener1 %>% 
  filter(parse_number(botcheck)!=59402938210|is.na(parse_number(botcheck))|str_length(botcheck)==0) 
nrow(screener1)-nrow(screener2)

#exclude responses who had a PHQ_9 score under 10
screener3 <- screener2 %>% 
  filter(phq_eligible==1)
nrow(screener2)-nrow(screener3)

#exclude responses who didn't consent to participate
screener4 <- screener3 %>% 
  filter(IRB_2==1) #dropping all data from those who didn't consent 
nrow(screener3)-nrow(screener4)

#responses that spent more than 24 hours on the survey are automatically closed.

## now, exclusions from the baseline survey

#loading baseline survey data
megastudy_bl <- screener4 %>% #ensuring baseline is from the first screening survey (in case someone completed multiple times)
  mutate(time='Baseline')  %>% 
  select(-c(RecipientLastName:IRB_2),prolific_id,botcheck) #dropping screening variables 

#exclude responses that didn't reach randomization
bl1 <- megastudy_bl%>% 
  filter(!is.na(group)) 
nrow(megastudy_bl)-nrow(bl1)

#GO TO LINE 539 FOR THE REST OF THE CONSORT!

## finally, exclusions from the 4-week follow-up survey

#loading week 4 follow-up survey data
megastudy_wk4_names <- read_csv("4+week+f_u+megastudy_June+24,+2025_10.57 (VALUES).csv") %>% names
megastudy_wk4 <- read_csv("4+week+f_u+megastudy_June+24,+2025_10.57 (VALUES).csv",
                          col_types = cols(emrgncy_prolific_pid=col_character()),
                          col_names = megastudy_wk4_names, skip = 3)%>% 
  mutate(PROLIFIC_PID= coalesce(PROLIFIC_PID,prolific_pid)) %>% 
  mutate(PROLIFIC_PID= coalesce(PROLIFIC_PID,emrgncy_prolific_pid)) %>% 
  mutate(participantId= coalesce(participantId,PROLIFIC_PID)) %>% 
  filter(!is.na(participantId)) %>%  #dropping test runs
  exclude_preview() 

#exclude duplicate responses
wk4_1 <- megastudy_wk4 %>% 
  arrange(StartDate) %>% 
  distinct(participantId,.keep_all = T)
nrow(megastudy_wk4)-nrow(wk4_1)

#exclude people who completed the follow-up but didn't reach consent in the baseline (this shouldn't happen but just in case)
wk4_2 <- wk4_1 %>% 
  left_join(bl1 %>% select(participantId,group))%>% #first, add baseline group to wk4
  filter(participantId %in% bl1$participantId)
nrow(wk4_1)-nrow(wk4_2)

#for consort, number of people who completed survey
wk4_2 %>%  filter(Finished==T) %>% group_by(group) %>%
  summarize(n())

# -----------------------------------------------------------------------------
#### STEP 3: combining DATASETS ####
# Here, we combine the baseline, week 4, and demographics data

#making a post-test only timepoint
posttest_vars <- bl1 %>% 
  select(participantId,StartDate:IPAddress,group,contains('_post')) %>% 
  rename_with(~ str_remove(., "_post"), everything())%>% 
  mutate(time='Post-test')

#create a baseline only timepoint
bl_final <- bl1 %>% select(-contains("_post")) 

#make week 4 timepoint
# (btw only those randomized will be invited to do week 4)
wk4_final <- wk4_2 %>% 
  mutate(time='Week 4')

#bind together

mega_alltimes_prep <- bind_rows(
  bl_final,posttest_vars, wk4_final) %>% 
  left_join(demogs,by='participantId') %>% #joining demographics
  arrange(StartDate) %>% 
  group_by(participantId) %>% arrange(participantId,time) %>% 
  mutate(pid=cur_group_id()) %>% #create a shorter PID instead of the CloudResearch PID
  fill(c(disability:how_long_depressed,names(demogs),group), #fill baseline demographics into other timepoints
       .direction='downup') %>%  ungroup 

### create PHQ-9,CEQ, agency, hopelessness, fats ###
#if > 33.334 of score items missing for someone, their score is NA
mega_alltimes_prep2 <- mega_alltimes_prep %>% 
  mutate(phq9_score = ifelse(
    rowMeans(is.na(select(., starts_with('phq9_'))))>0.334,NA,
    rowSums(across(starts_with('phq9_')),na.rm=T)),
    fats_score = ifelse(
      rowMeans(is.na(select(., starts_with('fats_'))))>0.334,NA,
      rowSums(across(starts_with('fats_')),na.rm=T)),
    readiness_score = ifelse(
      rowMeans(is.na(select(., starts_with('ready_'))))>0.334,NA,
      rowSums(across(starts_with('ready_')),na.rm=T)),
    hopeless_score = ifelse(
      rowMeans(is.na(select(., ends_with('_beck'))))>0.334,NA,
      rowSums(across(ends_with('_beck')),na.rm=T)),
    agency_score = ifelse(
      rowMeans(is.na(select(., starts_with('agency_'))))>0.334,NA,
      rowSums(across(starts_with('agency_')),na.rm=T)),
    CEQ_score = ifelse(
      rowMeans(is.na(select(., starts_with('ceq'))))>0.334,NA,
      rowSums(across(starts_with('ceq')),na.rm=T)),
    group=fct_relevel(factor(group),'trout',after=0)) %>% 
  mutate(expect_donthelp_r=6-expect_donthelp, #reversing two items
         expect_wontwork_r=6-expect_wontwork,.keep='unused') %>% 
  mutate(dep_expectancies_score = ifelse(
    rowMeans(is.na(select(., starts_with('expect_'))))>0.334,NA,
    rowSums(across(starts_with('expect_')),na.rm=T))) %>% 
  mutate(
    dep_expect_bl=ifelse(time!='Baseline',NA,dep_expectancies_score)) %>% 
  group_by(pid) %>% fill(dep_expect_bl,.direction='down') %>% ungroup() %>% 
  mutate(
    readiness_bl=ifelse(time!='Baseline',NA,readiness_score)) %>% 
  group_by(pid) %>% fill(readiness_bl,.direction='down') %>% ungroup() %>%
  mutate(
    agency_bl=ifelse(time!='Baseline',NA,agency_score)) %>% 
  group_by(pid) %>% fill(agency_bl,.direction='down') %>% ungroup() %>% 
  mutate(
    hopeless_bl=ifelse(time!='Baseline',NA,hopeless_score)) %>% 
  group_by(pid) %>% fill(hopeless_bl,.direction='down') %>% ungroup() %>% 
  mutate(
    fats_bl=ifelse(time!='Baseline',NA,fats_score)) %>% 
  group_by(pid) %>% fill(fats_bl,.direction='down') %>% ungroup() 

##### rename interventions to describe intervention strategy #####
mega_alltimes <- mega_alltimes_prep2 %>% mutate(group=case_when(
  group=='mindful_acceptance'~'Mindful Acceptance',
  group=='mattu_habits'~'5 Habits to Beat Depression',
  group=='llm_story'~'AI-assisted Personalized Narrative',
  group=='therabee'~'Personalized Intervention Recommender',
  group=='savoring'~'Savoring Strategies',
  group=='elevated'~'Moral Elevation',
  group=='trout'~'Passive Control',
  group=='abc_project'~'Behavioral Activation',
  group=='celeste'~'Inner Child Healing Walk',
  group=='dbt'~'Dialectical Behavioral Therapy Skills',
  group=='finding_focus'~'Mindful Attention Skills',
  group=='koko'~'Interactive Cognitive Reappraisal',
  group=='reframe_calm'~'Reframing Negative Thoughts')) %>% 
  mutate(group=factor(group) %>% fct_relevel('Passive Control',after=0))

##### alpha for each outcome #####
#at firs measurement time for that outcome
(phq_alpha <- psych::alpha(mega_alltimes %>% filter(time=='Baseline') %>% 
                             select(starts_with('phq9_'),-phq9_score)))
(fats_alpha <- psych::alpha(mega_alltimes %>% filter(time=='Baseline') %>% 
                              select(starts_with('fats_'),-fats_score)))
readiness_set <- mega_alltimes %>% filter(time=='Baseline') %>% 
  select(starts_with('ready_'),-readiness_score)
(readiness_alpha <- psych::alpha(readiness_set))
(readiness_spearman_brown_alpha <- spearman_brown(readiness_set[,1],readiness_set[,2]))
(hopeless_alpha <- psych::alpha(mega_alltimes %>% filter(time=='Post-test') %>% 
                                  select(ends_with('_beck'))))
(agency_alpha <- psych::alpha(mega_alltimes %>% filter(time=='Baseline') %>% 
                                select(starts_with('agency_'),-agency_score)))
(ceq_alpha <- psych::alpha(mega_alltimes %>% filter(time=='Post-test') %>% 
                             select(starts_with('ceq'),-CEQ_score)))

(expectancy_alpha <- psych::alpha(mega_alltimes %>% filter(time=='Post-test') %>% 
                                    select(starts_with('expect_'))))
# -----------------------------------------------------------------------------

#### STEP 4: DEALING WITH MISSING COVARIATES ####
#replace factors with the 'other',cont with median, ordinal w mode

#replace ages over 120 with NA
mega_alltimes <- mega_alltimes %>% mutate(Age=ifelse(Age>120,NA,Age))

#need to replace curly ' with the regular ' in relationship / marital status
mega_alltimes <- mega_alltimes %>% 
  mutate(across(`Relationship/Marital Status`,~gsub("[\u2018\u2019\u201A\u201B\u2032\u2035]", "'", .x)))

#first label disability variable
mega_alltimes <- mega_alltimes %>% mutate(disability=case_when(
  disability==1~'Yes',disability==2~'No',disability==3~'Prefer not to Answer',
  TRUE~as.character(disability)))

#then put household income in the right order
mega_alltimes$`Household Income` <- str_replace_all(
  mega_alltimes$`Household Income`, "150,0000","150,000")#fixing error on provider side
mega_alltimes$`Household Income` <- factor(
  mega_alltimes$`Household Income`,levels= c("Less than $10,000",
                                             "$10,000-$19,999","$20,000-$29,999","$30,000-$39,999",
                                             "$40,000-$49,999","$50,000-$59,999","$60,000-$69,999",
                                             "$70,000-$79,999","$80,000-$89,999","$90,000-$99,999",
                                             "$100,000-$124,999","$125,000-$149,999","$150,000-$174,999",
                                             "$175,000-$199,999","$200,000-$224,999",
                                             "$225,000-$249,999", "$250,000 or more","Prefer not to say"))
table(mega_alltimes$`Household Income`, useNA = "always")

#then put education in right order 
mega_alltimes$Education <- factor(mega_alltimes$Education,levels=c(
  "No formal education",
  "Less than a high school diploma",
  "High school graduate - high school diploma or the equivalent (for example: GED)",
  "Some college, but no degree" ,
  "Associate degree (for example: AA, AS)" ,
  "Bachelor's degree (for example: BA, AB, BS)",
  "Master's degree (for example: MA, MS, MEng, MEd, MSW, MBA)" ,
  "Doctorate degree (for example: PhD, EdD)"  ,
  "Professional degree (for example: MD, DDS, DVM, LLB, JD)",
  "Prefer not to say"))

#some race variables redundant. fix that. 
mega_alltimes <- mega_alltimes %>% 
  mutate(Race=case_when(
    str_detect(Race,'Asian Indian')~'Asian Indian',
    str_detect(Race,'Chinese')~'Chinese',
    str_detect(Race,'Filipino')~'Filipino',
    str_detect(Race,'Hawaiian')~'Hawaiian',
    str_detect(Race,'Japanese')~'Japanese',
    str_detect(Race,'Korean')~'Korean',
    str_detect(Race,'Samoan')~'Samoan',
    str_detect(Race,'Vietnamese')~'Vietnamese',
    str_detect(Race,'Japanese')~'Japanese',
    str_detect(Race,'not listed')~'Other',
    .default = Race))

#making a pre-imputation demographic dataset (for a table later on)
demog_notimputed <- mega_alltimes %>% filter(time=='Baseline') %>% 
  select(-participantId,-contains('prolific',ignore.case = T)) %>% 
  relocate(c(pid,time),.before=0) %>% deidentify(strict=F)

save(demog_notimputed,file='demogs_before_impute.Rdata')


table(mega_alltimes$Gender, useNA = "always")
mega_alltimes <- mega_alltimes %>%
  mutate(Gender = replace_na(Gender,'Other'))
table(mega_alltimes$Gender, useNA = "always")
# If the other category consists of less than 3 participants, we will need to  
# change "Other" to the mode
mega_alltimes <- mega_alltimes %>% 
  add_count(Gender,name='n_gender') %>% 
  mutate(Gender=factor(ifelse(n_gender<9&Gender=='Other',
                              cur_data() %>% slice_max(n_gender) %>% 
                                slice_head(n=1) %>% pull(Gender),Gender)))

table(mega_alltimes$Gender, useNA = "always")

# Age
# Replace missing with median
table(mega_alltimes$Age, useNA = "always")
median(mega_alltimes$Age, na.rm = T) # Display median
mega_alltimes <- mega_alltimes %>%
  mutate(Age = replace_na(Age, median(Age, na.rm = T)))
table(mega_alltimes$Age, useNA = "always")

# Race
# Replace missing with other

table(mega_alltimes $Race, useNA = "always")
mega_alltimes <- mega_alltimes %>%
  mutate(Race = replace_na(Race,'Prefer not to say'))
table(mega_alltimes$Race, useNA = "always")
# If the prefer not to say category consists of less than 3 participants, we will need to  
# change "Prefer not to say" to the mode
mega_alltimes <- mega_alltimes %>% 
  add_count(Race,name='n_race') %>% 
  mutate(Race=factor(ifelse(n_race<9&Race=='Prefer not to say',
                            cur_data() %>% slice_max(n_race) %>% 
                              slice_head(n=1) %>% pull(Race),Race)))

table(mega_alltimes$Race, useNA = "always")

# Education
# Replace missing with mode

table(mega_alltimes$Education, useNA = "always")
names(sort(-table(mega_alltimes$Education))[1]) # Display mode
mega_alltimes$Education[is.na(mega_alltimes$Education)] <- 
  names(sort(-table(mega_alltimes$Education))[1])
table(mega_alltimes$Education, useNA = "always")


#Relationship/Marital Status
table(mega_alltimes $`Relationship/Marital Status`, useNA = "always")
mega_alltimes <- mega_alltimes %>%
  mutate(`Relationship/Marital Status` = ifelse(
    is.na(`Relationship/Marital Status`),
    "I'd Rather Not Say",`Relationship/Marital Status`))
table(mega_alltimes$`Relationship/Marital Status`, useNA = "always")
# If the other category consists of less than 3 participants, we will need to  
# change "i'd rather not say" to the mode
mega_alltimes <- mega_alltimes %>% 
  add_count(`Relationship/Marital Status`,name='n_relationship') %>% 
  mutate(`Relationship/Marital Status`=factor(ifelse(n_relationship<9&`Relationship/Marital Status`=="I'd Rather Not Say",
                                                     cur_data() %>% slice_max(n_relationship) %>% 
                                                       slice_head(n=1) %>% pull(`Relationship/Marital Status`),`Relationship/Marital Status`)))

table(mega_alltimes$`Relationship/Marital Status`, useNA = "always")


#Political Party
table(mega_alltimes $`Political Party`, useNA = "always")
mega_alltimes <- mega_alltimes %>%
  mutate(`Political Party` = ifelse(
    is.na(`Political Party`),
    "Prefer not to say",`Political Party`))
table(mega_alltimes$`Political Party`, useNA = "always")
# If the prefer not to say category consists of less than 3 participants, we will need to  
# change "prefer not to say" to the mode
mega_alltimes <- mega_alltimes %>% 
  add_count(`Political Party`,name='n_pol_party') %>% 
  mutate(`Political Party`=factor(ifelse(n_pol_party<9&`Political Party`=="Prefer not to say",
                                         cur_data() %>% slice_max(n_pol_party) %>% 
                                           slice_head(n=1) %>% pull(`Political Party`),`Political Party`)))
table(mega_alltimes$`Political Party`, useNA = "always")


#Employment Status
table(mega_alltimes $`Employment Status`, useNA = "always")
mega_alltimes <- mega_alltimes %>%
  mutate(`Employment Status` = ifelse(
    is.na(`Employment Status`),
    "Prefer not to say",`Employment Status`))
table(mega_alltimes$`Employment Status`, useNA = "always")
# If the prefer not to say category consists of less than 3 participants, we will need to  
# change "prefer not to say" to the mode
mega_alltimes <- mega_alltimes %>% 
  add_count(`Employment Status`,name='n_employment') %>% 
  mutate(`Employment Status`=factor(ifelse(n_employment<9&`Employment Status`=="Prefer not to say",
                                           cur_data() %>% slice_max(n_employment) %>% 
                                             slice_head(n=1) %>% pull(`Employment Status`),`Employment Status`)))

table(mega_alltimes$`Employment Status`, useNA = "always")

# Household Income
# Replace missing with mode
table(mega_alltimes$`Household Income`, useNA = "always")
names(sort(-table(mega_alltimes$`Household Income`))[1]) # Display mode
mega_alltimes$`Household Income`[is.na(mega_alltimes$`Household Income`)] <- 
  names(sort(-table(mega_alltimes$`Household Income`))[1])
table(mega_alltimes$`Household Income`, useNA = "always")


#Social ladder replace NA with median
table(mega_alltimes$`Social ladder`, useNA = "always")
mega_alltimes$`Social ladder` <- ifelse(
  is.na(mega_alltimes$`Social ladder`),replace_na(
    median(mega_alltimes$`Social ladder`),na.rm=T),
  mega_alltimes$`Social ladder`)
table(mega_alltimes$`Social ladder`, useNA = "always")


#Disability replace na with Prefer not to answer
table(mega_alltimes$disability, useNA = "always")
mega_alltimes <- mega_alltimes %>%
  mutate(disability = ifelse(
    is.na(disability),
    "Prefer not to Answer",disability))
table(mega_alltimes$disability, useNA = "always")
# If the prefer not to answer category consists of less than 3 participants, we will need to  
# change "prefer not to answer" to the mode
mega_alltimes <- mega_alltimes %>% 
  add_count(disability,name='n_disability') %>% 
  mutate(disability=factor(ifelse(n_employment<9&disability=="Prefer not to Answer",
                                  cur_data() %>% slice_max(n_disability) %>% 
                                    slice_head(n=1) %>% pull(disability),disability)))

table(mega_alltimes$disability, useNA = "always")

#identify responses that said they didn't complete seriously 
#at baseline OR at week 4
notsrs <- c(
  bl1 %>% filter(seriouscheck==2) %>% pull(participantId),
  wk4_2 %>% filter(seriouscheck==2) %>% pull(participantId) %>% 
    unique)


#### *Counting duplicate IP Addresses* ####
bl1 %>% count(IPAddress) %>% count(n==2)  
bl1 %>% count(IPAddress) %>% count(n==3) 
#299 out of 7416 IP addresses were used twice and 9 were used three times

#duplicates that were randomized on both prolific and CloudResearch
prolific_ips <- bl1 %>% filter(!is.na(PROLIFIC_PID)) %>% pull(IPAddress)
connect_ips <- bl1 %>% filter(is.na(PROLIFIC_PID)) %>% pull(IPAddress)

bl1 %>% filter((IPAddress %in% prolific_ips) & IPAddress %in% connect_ips) %>% 
  distinct(IPAddress) #271 of these 299 participants were randomized on Prolific as well as CloudResearch Connect

bl1 %>% add_count(IPAddress) %>% filter(n>1) %>% group_by(IPAddress) %>% 
  arrange(StartDate) %>% mutate(survey_taken=paste0('take',row_number())) %>% 
  select(StartDate,IPAddress,survey_taken) %>% 
  pivot_wider(values_from = StartDate,names_from = survey_taken) %>% 
  mutate(timediff_weeks=as.numeric(as.duration(take2-take1),'weeks')) %>% 
  ungroup %>% 
  summarize(quantile(timediff_weeks)) 
# the median duration between the first and second time being randomized was 3.91 weeks [IQR 3.13 - 4.29], so several participants' 
# four-week follow-up results may have been preceded by taking the baseline survey again

duplicate_ip_pids <- mega_alltimes %>% filter(time=='Baseline') %>% add_count(IPAddress) %>% filter(n>1) %>% pull(pid)
  #some of these were likely  participants who happened to share a device with someone else
  # but many likely completed it multiple times 

mega_alltimes <- mega_alltimes %>% mutate(duplicate_ip=ifelse(pid %in% duplicate_ip_pids,'Yes','No')) %>% 
  group_by(IPAddress) %>% mutate(IP_address_id=cur_group_id()) %>% ungroup()

#- We calculated these numbers in the cleaning script since IP addresses are identifiable.
#- Responses from 299 out of 7416 IP addresses reached randomization twice and 10 did three times
#- 271 of these participants were randomized on Prolific as well as CloudResearch Connect.
#- the median duration between the first and second time being randomized was 3.91 weeks [IQR 3.13 - 4.29], so several participants' four-week follow-up data may have been preceded by taking the baseline survey again, which is not good!
#- Some of these IP duplicates may have come from participants who happened to share a device with someone else, but many people likely completed it multiple times.


# - 74% of duplicate IP addresses had identical age, gender, and race, so we'll assume most of them are the same 
#person taking the survey twice.
samezies <- mega_alltimes %>% filter(duplicate_ip=='Yes',time=='Baseline')%>% 
    group_by(IP_address_id) %>%  
    select(IP_address_id,Age, Gender, Race,StartDate,pid)  %>%
  mutate(StartDate = as.Date(StartDate)) %>%
  arrange(StartDate, .by_group = TRUE) %>%
  summarize(
    min_age = first(Age),
    valid_age_progression = all((Age == min_age) | (Age == min_age + 1 & StartDate > first(StartDate))),
    Gender_same = n_distinct(Gender) == 1,
    Race_same = n_distinct(Race) == 1) %>%
  mutate(all_same = valid_age_progression & Gender_same & Race_same) 

ip_gender_race_age_same_at_bl_ips <- samezies %>%filter(all_same==T) %>% pull(IP_address_id)

ip_gender_race_age_same_at_bl_pids <- mega_alltimes %>% filter(time=='Baseline') %>%
  mutate(ip_gender_race_age_same_at_bl=ifelse(
    IP_address_id %in% ip_gender_race_age_same_at_bl_ips,1,0)) %>% 
  filter(ip_gender_race_age_same_at_bl==T)%>% pull(pid)

mega_alltimes <- mega_alltimes %>% mutate(ip_gender_race_age_same_at_bl=ifelse(
  pid %in% ip_gender_race_age_same_at_bl_pids,1,0))

samezies %>% summarize(
        prop_not_identical = mean(!all_same),
        num_groups = n(),
        num_not_identical = sum(!all_same),
        num_identical = sum(all_same)
    )

mega_alltimes_for_final_consort_prep1 <- mega_alltimes %>%
  group_by(IP_address_id, time) %>%
  arrange(StartDate, .by_group = TRUE) %>%
  filter( ip_gender_race_age_same_at_bl==0 |
            (ip_gender_race_age_same_at_bl==1 & row_number() == 1 )) %>%
  ungroup()  

# above leaves some rows from people who completed wk4 on a different device,
# so here dropping people who didn't complete the baseline
mega_alltimes_for_final_consort_prep2 <- mega_alltimes_for_final_consort_prep1 %>% filter(time == "Baseline") %>%  
  distinct(pid) %>%
  inner_join(mega_alltimes_for_final_consort_prep1, by = "pid")  

#FOR FINAL CONSORT DROP
mega_alltimes_for_final_consort_prep2 %>% filter(time=='Baseline')%>% nrow()

mega_alltimes %>% filter(time=='Baseline') %>% nrow() - 
  mega_alltimes_for_final_consort_prep2 %>% filter(time=='Baseline') %>% nrow()

#for consort, n randomized to each group
mega_alltimes_for_final_consort_prep2 %>% filter(time=='Baseline')  %>% count(group)

#for consort, number of people who completed SSI
mega_alltimes_for_final_consort_prep2 %>% filter(time=='Baseline')  %>%  
  filter(!is.na(`Q559_Page Submit`)) %>% group_by(group) %>%
  summarize(n())

#for consort, number of people who completed survey
mega_alltimes_for_final_consort_prep2 %>% filter(time=='Baseline')  %>%  
  filter(Finished==T) %>% group_by(group) %>%
  summarize(n())


# ---------------------------------------------------------------------------------------------------
#### STEP 5: EXPORTING DATA ####
# Here, we export the data.
# We make two datasets: one with all the variables, and
# another with just the ones needed for our primary
# and secondary variables. Note that before we share data
# we'll go through and redact any identifiable data
# (for example, in written response items)

#saving in Rdata to keep factor levels, but also .csv just in case.
#if using the .csv, you'll need to relevel "group" before running analyses.

mega_alltimes <- mega_alltimes%>% deidentify(strict=F)

mega_allvars<- mega_alltimes %>% select(-participantId,-contains('prolific',ignore.case = T)) %>% 
  relocate(c(pid,time),.before=0) 
mega_allvars %>% write_csv('utw_allvars.csv')
save(mega_allvars,file='utw_allvars.Rdata')


mega_selectvars <- mega_allvars %>% 
  select(pid, time, StartDate, group,Progress, Age:fats_bl,
         starts_with('aha'),star_rating_1,`Q559_Page Submit`,`Q1525_Page Submit`,
         `Q423_Page Submit`,`Social ladder`,disability,program_helped,
         -expect_donthelp_r,-expect_wontwork_r,ffmq_distressing,help4mental,
         ready_important,ready_confident,recruitment_platform,`Duration (in seconds)`,
         att_check=ac1_str_disag,duplicate_ip,IP_address_id,ip_gender_race_age_same_at_bl)
mega_selectvars %>% write_csv('utw_selectvars.csv')
save(mega_selectvars,file="utw_selectvars.Rdata")

